import pandas as pd
import numpy as np
import matplotlib.pyplot as plt # Loading necessary libraries
import seaborn as sns
import folium
from scipy import stats
# Set display options for pandas
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
# Set the style for our visualizations
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
%matplotlib inline
# Read the business-license workbook into a DataFrame.
# NOTE(review): the name `license` shadows the interactive built-in of the
# same name; it is kept because the rest of the notebook refers to it.
license = pd.read_excel('Atlanta_Business_License_Records_2025.xlsx')

# Preview the leading records
first_rows = license.head(5)
print("First 5 rows of the dataset:")
print(first_rows)

# Column dtypes and non-null counts (info() writes its report directly)
print("\nDataset Info:")
license.info()

# Summary statistics as produced by DataFrame.describe()
summary_stats = license.describe()
print("\nBasic Statistical Summary:")
print(summary_stats)

# Per-column tally of missing entries
missing_per_column = license.isna().sum()
print("\nMissing Values Count:")
print(missing_per_column)
First 5 rows of the dataset:
license_number company_name company_dba \
0 GBL-0121-00257 H & H Interiors LLC NaN
1 GBL-0121-00258 One Lion Studios LLC NaN
2 GBL-0121-00272 Destingkt Designs LLC NaN
3 GBL-0121-00274 Mac Cigars LLC J's Cigars
4 GBL-0121-00283 Essence of She Day Spa Salon NaN
license_classification issued_date \
0 Other Services except Public Administration 2025-01-07 13:21:16.143
1 Other Services except Public Administration 2025-02-16 18:39:21.153
2 Other Services except Public Administration 2025-03-04 23:04:53.980
3 Retail Trade 2025-02-25 23:09:11.497
4 Other Services except Public Administration 2025-01-09 19:04:30.857
naics_code naics_name predirection address_line1 \
0 541410.0 Interior Design Services NE 2652 Forrest
1 541840.0 Media Representatives NaN 3343 Peachtree Rd NE
2 541410.0 Interior Design Services NaN 931
3 453991.0 Tobacco Stores NaN 2072
4 812112.0 Beauty Salons NaN 144
address_line2 street_type postdirection unit_suite city state \
0 NaN WAY NaN NaN Atlanta GA
1 NaN NaN NaN Ste 145-149 Atlanta GA
2 PONCE DE LEON AVE NE NaN Atlanta GA
3 Defoors Ferry RD NW 120 Atlanta GA
4 MORELAND AVE NE UNIT 14A ATLANTA GA
postal_code address_concat \
0 NaN 2652 Forrest nan WAY Atlanta GA nan
1 30326 3343 Peachtree Rd NE nan nan Atlanta GA 30326
2 NaN 931 PONCE DE LEON AVE Atlanta GA nan
3 30318 2072 Defoors Ferry RD Atlanta GA 30318
4 30307 144 MORELAND AVE ATLANTA GA 30307
address_api longitude latitude \
0 2652 Forrest Way NE, Atlanta, Georgia, 30305 -84.379893 33.827640
1 3343 Peachtree Rd NE, Atlanta, Georgia, 30326 -84.367109 33.846760
2 931 Ponce de Leon Ave NE, Atlanta, Georgia, 30306 -84.356939 33.773524
3 2072 Defoors Ferry Rd NW, Atlanta, Georgia, 30318 -84.426318 33.812071
4 144 Moreland Ave NE, Atlanta, Georgia, 30307 -84.349501 33.756604
disinvested_neighborhood council_district npu
0 False 7 B
1 False 7 B
2 False 2 N
3 False 9 D
4 False 5 N
Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17623 entries, 0 to 17622
Data columns (total 23 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 license_number 17623 non-null object
1 company_name 17623 non-null object
2 company_dba 4346 non-null object
3 license_classification 17623 non-null object
4 issued_date 17498 non-null datetime64[ns]
5 naics_code 17390 non-null float64
6 naics_name 17390 non-null object
7 predirection 647 non-null object
8 address_line1 17599 non-null object
9 address_line2 13165 non-null object
10 street_type 14879 non-null object
11 postdirection 13972 non-null object
12 unit_suite 7110 non-null object
13 city 17492 non-null object
14 state 17466 non-null object
15 postal_code 16333 non-null object
16 address_concat 17623 non-null object
17 address_api 17623 non-null object
18 longitude 17623 non-null float64
19 latitude 17623 non-null float64
20 disinvested_neighborhood 17623 non-null bool
21 council_district 17078 non-null object
22 npu 17104 non-null object
dtypes: bool(1), datetime64[ns](1), float64(3), object(18)
memory usage: 3.0+ MB
Basic Statistical Summary:
issued_date naics_code longitude \
count 17498 17390.000000 17623.000000
mean 2025-02-26 10:55:24.937929216 568457.671938 -84.254264
min 2024-08-12 11:49:14 111219.000000 -122.746250
25% 2025-02-03 13:34:47.917000192 453998.000000 -84.412296
50% 2025-02-13 15:08:51.181500160 541611.000000 -84.386019
75% 2025-03-07 23:03:36.446500096 721110.000000 -84.367373
max 2025-08-12 23:04:59.943000 928120.000000 101.971252
std NaN 146877.529815 6.591369
latitude
count 17623.000000
mean 33.839936
min 19.524210
25% 33.751885
50% 33.781221
75% 33.818418
max 47.801692
std 0.978027
Missing Values Count:
license_number 0
company_name 0
company_dba 13277
license_classification 0
issued_date 125
naics_code 233
naics_name 233
predirection 16976
address_line1 24
address_line2 4458
street_type 2744
postdirection 3651
unit_suite 10513
city 131
state 157
postal_code 1290
address_concat 0
address_api 0
longitude 0
latitude 0
disinvested_neighborhood 0
council_district 545
npu 519
dtype: int64
Missing Values Analysis
The code output shows the number of missing values in each column of the dataset using license.isnull().sum(). Based on the first few rows displayed, several columns contain missing data.
- company_dba has many missing entries because most businesses operate under their official registered name rather than a “Doing Business As” alias.
- predirection, address_line2, street_type, and unit_suite also show missing values, which is expected since not every business address includes a street direction, secondary address, or suite number.
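- The derived address fields address_concat and address_api report no missing values. However, address_concat embeds the literal text "nan" wherever a source component was missing (e.g., "2652 Forrest nan WAY Atlanta GA nan"), so it is less clean than its zero count suggests.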
- The core identifiers license_number, company_name, and license_classification are complete. issued_date (125 missing) and naics_code / naics_name (233 missing each) are nearly complete, so the dataset’s primary structure is intact, but those rows must be handled before date- or industry-based analysis.
Overall, the missing values are concentrated in optional or location-specific fields rather than critical identifiers. This pattern suggests that the dataset is relatively suitable for analysis, though address-based mapping or spatial analysis may require additional data cleaning and imputation.
# List every column name, one per line
print("All columns in the dataset:")
column_listing = "\n".join(license.columns)
print(column_listing)
All columns in the dataset: license_number company_name company_dba license_classification issued_date naics_code naics_name predirection address_line1 address_line2 street_type postdirection unit_suite city state postal_code address_concat address_api longitude latitude disinvested_neighborhood council_district npu
Section 1: Variable Analysis
1. Identifiers
license_number – Unique identifier for each business license issued by the City of Atlanta.
company_name – Registered name of the business entity holding the license.
company_dba – "Doing Business As" name, used when the business operates under a trade or brand name different from the registered one.
These columns uniquely identify each licensed business.
2. Business Information
license_classification – Categorizes the type of business activity, such as Retail Trade, Accommodation and Food Services, or Other Services except Public Administration.
issued_date – Date and time when the license was officially issued. Despite the file name, values range from August 2024 to August 2025, with most falling in early 2025.
naics_code – Numeric code representing the business industry according to the North American Industry Classification System (NAICS).
naics_name – Full description of the NAICS code, detailing the type of business activity (e.g., Interior Design Services, Health Care & Social Assistance).
These variables capture what kind of business activity is licensed and when it became active.
3. Address Components
predirection, street_type, postdirection – Standardized address components used for mapping and consistency in street naming (e.g., NE, AVE, ST).
address_line1, address_line2, unit_suite – Main street address, secondary address information (such as building or floor), and unit or suite identifiers for the business location.
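city, state, postal_code – Geographic location information identifying where the business operates. Most entries are in Atlanta, GA, but capitalization is inconsistent (e.g., "Atlanta" vs. "ATLANTA"), each field has some missing values (131, 157, and 1,290 respectively), and the coordinate ranges show that a few records geocode far outside the city.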
address_concat, address_api – Concatenated or API-verified address fields used for mapping or geocoding.
These variables allow for accurate spatial analysis and mapping of business locations across the city.
4. Geospatial Variables
longitude, latitude – Geographic coordinates of the business location, used for geospatial visualization and analysis.
These enable visualization of business distributions and proximity to different neighborhoods or infrastructure.
5. Administrative and Neighborhood Indicators
disinvested_neighborhood – Boolean indicator (TRUE/FALSE) denoting whether the business is located in a historically disinvested or economically disadvantaged neighborhood.
council_district – City Council district in which the business operates, recorded as a district number stored as text (e.g., 7, 2, 9). The letter that appears beside it in the data preview belongs to the separate npu column.
npu – Neighborhood Planning Unit (NPU) designation, representing local community zones used for planning and public engagement (e.g., E, B, M, L).
These features support equity-focused or policy analyses by connecting businesses to governance and socioeconomic geography.
Section 2: Business Distribution and Counts Visualizations
Bar chart of license_classification
Show how many businesses fall under each classification (e.g., Retail Trade, Food Services, Other Services).
Purpose: Identify dominant sectors among newly licensed small businesses in Atlanta.
Bar chart of naics_name
Display the top 10 most common business types using NAICS names.
Purpose: Highlight the most frequent business activities (e.g., beauty salons, restaurants, design services).
Histogram of issued_date
Plot licenses by issuance month or week.
Purpose: Examine temporal trends in new business formation across the period covered by the data (August 2024 to August 2025).
# Horizontal count plot of licenses per classification, with bars ordered
# by how often each classification occurs (value_counts order)
classification_order = license['license_classification'].value_counts().index
fig, ax = plt.subplots(figsize=(15, 6))
sns.countplot(
    data=license,
    y='license_classification',
    order=classification_order,
    ax=ax,
)
ax.set_title('Distribution of Business Licenses by Classification')
ax.set_xlabel('Number of Businesses')
ax.set_ylabel('License Classification')
# Tighten margins so the long category labels are not clipped
fig.tight_layout()
plt.show()
# Ten most frequent NAICS descriptions and their counts
top_10_naics = license['naics_name'].value_counts().head(10)
# Horizontal bar chart: counts on the x-axis, business type on the y-axis
fig, ax = plt.subplots(figsize=(15, 6))
sns.barplot(x=top_10_naics.values, y=top_10_naics.index, ax=ax)
ax.set_title('Top 10 Most Common Business Types (NAICS)')
ax.set_xlabel('Number of Businesses')
ax.set_ylabel('NAICS Business Type')
# Tighten margins so the long labels fit inside the figure
fig.tight_layout()
plt.show()
# Make sure issued_date holds datetimes before plotting it on a time axis
license['issued_date'] = pd.to_datetime(license['issued_date'])
# Histogram of issue timestamps split into 50 bins
fig, ax = plt.subplots(figsize=(15, 6))
sns.histplot(data=license, x='issued_date', bins=50, ax=ax)
ax.set_title('Distribution of Business Licenses by Issue Date')
ax.set_xlabel('Issue Date')
ax.set_ylabel('Number of Licenses')
# Slant the date labels so neighbouring ticks do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()
# Licenses per calendar month: map each timestamp to its monthly period,
# count occurrences, then put the months in chronological order
issue_months = license['issued_date'].dt.to_period('M')
monthly_counts = issue_months.value_counts().sort_index()
# One bar per month
fig, ax = plt.subplots(figsize=(15, 6))
monthly_counts.plot(kind='bar', ax=ax)
ax.set_title('Number of Licenses Issued by Month')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Licenses')
# Slant the month labels for readability
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()
Section 3: Geographic and Spatial Analysis Visualizations
Scatter map (longitude vs. latitude)
Plot all business locations on a city map of Atlanta using longitude and latitude coordinates.
Purpose: Visualize spatial distribution and clustering of businesses.
Choropleth map by council_district or NPU
Aggregate the number of businesses per district or neighborhood planning unit (NPU).
Purpose: Identify areas with high or low small-business density.
Highlight disinvested_neighborhood
Use color to differentiate businesses located in disinvested neighborhoods (TRUE/FALSE).
Purpose: Compare economic activity between historically underinvested areas and others.
# Build an interactive map with one red circle marker per licensed business,
# save it as a standalone HTML file, and show it in the notebook.

# Base map centred on Atlanta ([latitude, longitude])
atlanta_center = [33.7490, -84.3880]
m = folium.Map(location=atlanta_center, zoom_start=11)

# Validate the coordinates once, up front. Values that are missing or cannot be
# read as numbers become NaN and are excluded, instead of being swallowed one
# exception at a time inside the loop.
latitudes = pd.to_numeric(license['latitude'], errors='coerce')
longitudes = pd.to_numeric(license['longitude'], errors='coerce')
has_coordinates = latitudes.notna() & longitudes.notna()

# Report how many rows were left off the map rather than dropping them silently
skipped_rows = int((~has_coordinates).sum())
if skipped_rows:
    print(f"Skipped {skipped_rows} rows with invalid coordinates")

# Walk the three needed columns in lockstep. DataFrame.iterrows() would build a
# full Series for every row, which is needlessly slow for a frame of this size.
for lat, lon, name in zip(
    latitudes[has_coordinates],
    longitudes[has_coordinates],
    license.loc[has_coordinates, 'company_name'],
):
    folium.CircleMarker(
        location=[float(lat), float(lon)],
        radius=3,
        color='red',
        fill=True,
        popup=str(name),
    ).add_to(m)

# Save the map
m.save('atlanta_business_locations.html')
# Display the map
display(m)